# set working directory
setwd('~/replication_files')

# function to load and install packages
load_packages<-function(packages){
  sapply(packages,function(x){
    f<-require(x,character.only=T)
    if(!f) install.packages(x); f<-require(x,character.only=T)
    return(f)},USE.NAMES=T)
}

# load packages
load_packages(c('data.table','gtools','sandwich','miceadds'))

# format numbers to specific decimal places
specify_decimal<-function(x,ndec=3,drop0leading=F,drop0trailing=F,comma=F){
  f<-format(round(x,ndec),nsmall=ndec,drop0trailing=drop0trailing,scientific=F,big.mark=ifelse(comma,",",""))
  if(drop0leading){
    f<-sub("0.",".",f,fixed=T)
  }
  return(trimws(f))
}

# create stars depending on p-value
stars_pval<-function (p.value) {
  unclass(symnum(p.value, corr = FALSE, na = FALSE, cutpoints = c(0, 0.001, 0.01, 0.05, 1), symbols = c("***", "**", "*", "")))
}

(FinDT<-readRDS('Tables3A1.rds'))

# create dependent variables
(DVs<-rep(c('provax_bin','provax_count','antivax_bin','antivax_count'),each=2))
# number of models equal to number of DVs
(Nmods<-length(DVs))

# main independent variables for models
(IVs_main<-rep(list(c('college','female','nonwhite','age30t44','age45t59','age60p','parent','caseid_file_date_N','log(caseid_N)'),
                    c('college','female','nonwhite','age30t44','age45t59','age60p','parent','caseid_file_date_N','log(caseid_N)','vaccine_scale')),4))
# add data name as control
(IVs_all<-lapply(IVs_main,function(x) c(x,'Data_Name')))

# create formulas for models
(formulas<-lapply(1:Nmods,function(x) as.formula(paste0(DVs[x],'~',paste(IVs_all[[x]],collapse='+')))))

# not sure why but running lm.cluster first outside of lapply seems to be necessary to get it to run inside lapply:
lm.cluster(data=FinDT,formula=formulas[[1]],cluster='resp_id',weights=FinDT[['weight']])

# create list of model results
modlist<-lapply(1:Nmods,function(x){
  # OLS with errors clustered on respondent id with survey weights
  mod<-miceadds::lm.cluster(data=FinDT,formula=formulas[[x]],cluster='resp_id',weights=FinDT[['weight']])
  # get model summary
  modsum<-summary(mod)
  # extract coefficients
  (coefs<-modsum[c(IVs_main[[x]],'(Intercept)'),])
  
  # get r squared
  (R2<-specify_decimal(summary(mod$lm_res)$r.squared,3))
  # get number of observations
  (N<-paste0('\\multicolumn{1}{l}{',formatC(length(summary(mod$lm_res)$residuals),big.mark=','),'}'))
  
  # string of coefficients
  (Est<-specify_decimal(coefs[,1],3))
  # standard errors
  (SE<-paste0('(',specify_decimal(coefs[,2],3),')'))
  # p value stars
  (p<-stars_pval(coefs[,4]))
  # estimate with stars
  (Estp<-paste0(Est,'{$^{',p,'}$}'))
  # combine into data.table
  (dt<-data.table(rn=row.names(coefs),Estp=Estp,SE=SE))
  # fix names
  setnames(dt,2:3,paste0(c('Estp','SE'),x))
  # return list with the data.table of model results, r squared, and N
  list(dt=dt,R2=R2,N=N)
})

# create list of data.tables from model
(dtlist<-lapply(modlist,`[[`,1))

# merge all the data.tables by row name
(dtmerge<-Reduce(function(x,y) merge(x,y,by='rn',all=T),dtlist))
# create ordered data.table to merge by
(dtorder<-data.table(rn=c('college','female','nonwhite','age30t44','age45t59','age60p','parent','caseid_file_date_N','log(caseid_N)','vaccine_scale','(Intercept)')))
set(dtorder,NULL,'order',1:nrow(dtorder))

# merge model results with ordering data.table
(dtmerge<-merge(dtmerge,dtorder,by='rn'))
# reorder rows
setorder(dtmerge,order)
dtmerge

# fill in missing values with an empty space
(cols2fix<-setdiff(names(dtmerge),c('rn','order')))
for(x in cols2fix){
  set(dtmerge,dtmerge[,.I[is.na(get(x))]],which(names(dtmerge)==x),'')
}
dtmerge

# rearrange the results into a table format
(tabMain<-lapply(1:Nmods,function(x) c(rbind(dtmerge[[paste0('Estp',x)]],dtmerge[[paste0('SE',x)]]))))
(tabMain<-do.call(cbind,tabMain))
# collapse columns together with &
(tabMain<-apply(tabMain,1,function(x) paste(x,collapse=' & ')))

# names for coefficients on table
tabCoefs<-c('College','~graduate','Female','','Nonwhite','','Age 30--44','','Age 45--59','','Age 60+','','Parent','','Days in','~panel','log(URLs','~visited)','Vaccine','~favorability','Constant','')
# add coefficient labels to table
(tabMain<-paste(tabCoefs,tabMain,sep=' & '))
# add line break at end of lines
(tabMain<-paste0(tabMain,' \\\\'))
###
# get the R squareds
(modR2s<-lapply(modlist,`[[`,2))
# get the Ns
(modNs<-lapply(modlist,`[[`,3))

# create the bottom of the table with the R squared and N values
(tabBottom<-cbind(c('$R^2$','$N$'),rbind(unlist(modR2s),unlist(modNs))))
(tabBottom<-apply(tabBottom,1,function(x) paste(x,collapse=' & ')))
(tabBottom<-paste(tabBottom,'\\\\'))

# combine the main results and the other model information into another string
(tabContent<-c(tabMain,'\\midrule',tabBottom))

# head of Latex table
tab_head<-c('\\begin{table}[!htb]',
            '\\caption{Correlates of exposure to online vaccine-related information (behavioral data)}',
            '\\begin{center}',
            '\\begin{threeparttable}',
            '\\begin{tabular}{l *{8}{S[table-format=1.3,table-space-text-post={$^{***}$}]} }',
            '\\toprule',
            '& \\multicolumn{4}{c}{\\myuline{Non-skeptical webpages}} & \\multicolumn{4}{c}{\\myuline{Vaccine-skeptical webpages}} \\\\',
            '& \\multicolumn{2}{c}{Visited $\\geq 1$ page} & \\multicolumn{2}{c}{Total pages visited} & \\multicolumn{2}{c}{Visited $\\geq 1$ page} & \\multicolumn{2}{c}{Total pages visited} \\\\',
            '\\midrule')

# notes for Latex table
tab_notes<-paste('\\item \\leavevmode\\kern-\\scriptspace\\kern-\\labelsep',
                 '$^{*} p < 0.05$, $^{**} p < .01$, $^{***} p<.001$ (two-sided); OLS models with standard errors clustered by respondent and survey weights applied. All models include panel fixed effects. Data from YouGov Pulse panel participants with online traffic data available. Each observation represents behavioral web traffic data associated with responses to one of seven national surveys conducted from 2016--2019. Vaccine attitudes data were collected in prior surveys among a subset of respondents.')

# bottom of Latex table
tab_foot<-c('\\bottomrule',
            '\\end{tabular}',
            '\\begin{tablenotes}[flushleft]',
            tab_notes,
            '\\end{tablenotes}',
            '\\end{threeparttable}',
            '\\end{center}',
            '\\label{table:tab3}',
            '\\end{table}')

# final Latex table string
(tab_fin<-c(tab_head,tabContent,tab_foot))

# write table to file
writeLines(tab_fin,'Table3.tex')
